#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.preprocessing import StandardScaler  # 导入数据标准化包
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsRegressor   # knn用于回归
from sklearn.metrics import mean_squared_error  # 计算均方根误差
from sklearn.model_selection import train_test_split


def knn_pred_house_price(csv_path=None):
    """Fit a KNN regressor on listing data and print diagnostic output.

    The function loads a listings CSV, keeps a fixed set of numeric columns
    from the first 200 rows, cleans the ``price`` column, drops rows with
    missing values, standardizes every selected column (``price`` included),
    and then:

    * prints the column means of the standardized data,
    * prints a seeded random train/test split of the data,
    * prints the Euclidean distance between row 0 and row 20 on two columns,
    * fits ``KNeighborsRegressor`` on the first 150 rows using
      ``accommodates`` and ``bedrooms`` and prints the RMSE and the
      ``score`` of the model on the remaining rows.

    Because ``price`` is standardized together with the features, the
    printed RMSE is expressed in standardized units, not in currency.

    Args:
        csv_path: Path of the listings CSV. When ``None`` the original
            hard-coded location is used, so existing callers are unaffected.

    Returns:
        None. All results are written to standard output.
    """
    seed = 123
    if csv_path is None:
        # Raw string: the path contains backslashes followed by characters
        # that are not valid escape sequences (e.g. "\p", "\K", "\l").
        csv_path = r'E:\培训教程\python\唐宇迪-机器学习课程\机器学习算法配套案例实战\K近邻\listings.csv'

    # Load the data and select the columns of interest.
    dc_listings = pd.read_csv(csv_path)
    pd.options.display.max_columns = 50
    features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price',
                'minimum_nights', 'maximum_nights', 'number_of_reviews']
    # .copy() yields an independent frame, so the column assignments below
    # do not write into a slice of the frame returned by read_csv.
    dc_listings = dc_listings[features].head(200).copy()

    # Remove the currency symbol, thousands separators and whitespace from
    # the price strings, then convert to float. The explicit character class
    # with regex=True avoids depending on the pandas default for `regex`
    # (a bare "$" is an end-of-string anchor when treated as a regex).
    dc_listings['price'] = (
        dc_listings['price'].str.replace(r'[$,\s]', '', regex=True).astype(float)
    )
    # Drop every row that contains a missing value.
    dc_listings = dc_listings.dropna()

    # Standardize each column to zero mean and unit standard deviation.
    # NOTE: the scaler is fitted on all rows before any train/test split and
    # also rescales the target column `price`.
    dc_listings[features] = StandardScaler().fit_transform(dc_listings[features])
    normalized_listings = dc_listings
    print(dc_listings.mean())

    # Positional split used by the model below: first 150 rows train,
    # the remainder test.
    norm_train_df = normalized_listings.iloc[0:150].copy()
    norm_test_df = normalized_listings.iloc[150:].copy()

    # Seeded random split, printed for inspection only. The target column is
    # excluded from the feature matrix so the features do not contain the
    # value being predicted.
    target = normalized_listings['price']
    predictors = normalized_listings.drop(columns='price')
    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.3, random_state=seed)

    print(X_train)
    print(X_test)
    print(y_train)
    print(y_test)

    # scipy already provides a ready-made distance function.
    distance_cols = ['accommodates', 'bathrooms']
    first_listing = normalized_listings.iloc[0][distance_cols]
    # Despite the label printed below, the second point is the row at
    # position 20.
    other_listing = normalized_listings.iloc[20][distance_cols]
    first_fifth_distance = distance.euclidean(first_listing, other_listing)
    print('first_fifth_distance : ', first_fifth_distance)

    # KNN regression with scikit-learn, using two feature columns.
    cols = ['accommodates', 'bedrooms']
    knn = KNeighborsRegressor()
    # fit(X, y): learn the relation between the chosen columns and the label.
    knn.fit(norm_train_df[cols], norm_train_df['price'])
    # predict returns one predicted value per row of the test frame.
    two_features_predictions = knn.predict(norm_test_df[cols])
    # Root mean squared error between true and predicted prices; smaller is
    # better.
    two_features_mse = mean_squared_error(norm_test_df['price'], two_features_predictions)
    two_features_rmse = two_features_mse ** (1 / 2)
    print(two_features_rmse)

    # Score the model on the same two columns it was fitted on, against the
    # true test labels (not against its own predictions).
    print(knn.score(norm_test_df[cols], norm_test_df['price']))



# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if "__main__" == __name__:
    knn_pred_house_price()